package org.apache.solr.handler.component;

import java.io.IOException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.TermVectorParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.ReturnFields;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Return term vectors for the documents in a query result set.
 * <p/>
 * Info available: term, frequency, position, offset, IDF.
 * <p/>
 * <b>Note</b> Returning IDF can be expensive.
 * 
 * <pre class="prettyprint">
 * &lt;searchComponent name="tvComponent" class="solr.TermVectorComponent"/&gt;
 * 
 * &lt;requestHandler name="/terms" class="solr.SearchHandler"&gt;
 *   &lt;lst name="defaults"&gt;
 *     &lt;bool name="tv"&gt;true&lt;/bool&gt;
 *   &lt;/lst&gt;
 *   &lt;arr name="last-components"&gt;
 *     &lt;str&gt;tvComponent&lt;/str&gt;
 *   &lt;/arr&gt;
 * &lt;/requestHandler&gt;
 * </pre>
 * 
 * 
 */
public class TermVectorComponent extends SearchComponent implements
		SolrCoreAware {

	/**
	 * Name of the boolean request parameter that switches this component on;
	 * {@code process} returns immediately unless it is true, and
	 * {@code finishStage} only merges shard requests that carried it.
	 */
	public static final String COMPONENT_NAME = "tv";

	/** Initialization arguments, stored as-is by {@code init(NamedList)}. */
	protected NamedList initParams;
	/** Key under which the term vector section is added to the response. */
	public static final String TERM_VECTORS = "termVectors";

	/**
	 * Works out which fields term vectors should be looked up for.
	 * <p>
	 * When {@link TermVectorParams#FIELDS} carries a non-empty value, every
	 * value of that parameter is split with
	 * {@link SolrPluginUtils#split(String)} and the resulting names are
	 * returned in the order given -- no glob expansion or other
	 * interpretation is applied.
	 * </p>
	 * <p>
	 * Otherwise the decision is delegated to the main {@link CommonParams#FL}
	 * parameter, parsed through {@link ReturnFields}:
	 * </p>
	 * <ul>
	 * <li><code>null</code> is returned when all fields are wanted (e.g.
	 * "fl=*", or no "fl" at all), meaning "use every field";</li>
	 * <li>the concrete Lucene field names are returned when there are any;</li>
	 * <li>an empty set is returned when the "fl" names nothing this component
	 * can use (functions, transformers, partial globs, score, ...).</li>
	 * </ul>
	 *
	 * @param rb
	 *            the response builder whose request parameters are inspected
	 * @return the field names to use, an empty set for "no usable fields", or
	 *         <code>null</code> for "all fields"
	 */
	private Set<String> getFields(ResponseBuilder rb) {
		final SolrParams params = rb.req.getParams();
		final String[] tvFieldLists = params.getParams(TermVectorParams.FIELDS);

		final boolean tvFlAbsent = null == tvFieldLists
				|| 0 == tvFieldLists.length
				|| (1 == tvFieldLists.length && 0 == tvFieldLists[0].length());

		if (!tvFlAbsent) {
			// tv.fl was supplied: take the raw names as they are, without any
			// special parsing or glob handling
			final Set<String> explicitNames = new LinkedHashSet<String>();
			for (String rawList : tvFieldLists) {
				Collections.addAll(explicitNames, SolrPluginUtils.split(rawList));
			}
			return explicitNames;
		}

		// no tv.fl, so fall back to whatever the main fl asks for
		final ReturnFields returnFields = new ReturnFields(
				params.getParams(CommonParams.FL), rb.req);
		if (returnFields.wantsAllFields()) {
			return null;
		}

		final Set<String> luceneNames = returnFields.getLuceneFieldNames();
		if (null == luceneNames) {
			// empty set signals that no fields should be used
			return Collections.<String> emptySet();
		}
		return luceneNames;
	}

	/**
	 * Builds the {@link #TERM_VECTORS} section of the response for the
	 * current request.
	 * <p>
	 * Nothing happens unless the boolean request parameter
	 * {@link #COMPONENT_NAME} is true. Otherwise the method:
	 * </p>
	 * <ol>
	 * <li>records the schema's uniqueKey field name (if there is one);</li>
	 * <li>reads the request-wide options (tf, positions, offsets, df, tf-idf,
	 * or "all") and the per-field overrides for every field returned by
	 * {@link #getFields(ResponseBuilder)};</li>
	 * <li>adds a "warnings" entry listing requested fields that have no term
	 * vectors, positions or offsets according to the schema (the uniqueKey
	 * field is never reported);</li>
	 * <li>iterates either the internal doc ids given in
	 * {@link TermVectorParams#DOC_IDS} or, when none are given, the documents
	 * of the current result list, and emits one entry per document.</li>
	 * </ol>
	 *
	 * @param rb
	 *            the response builder for the request being processed
	 * @throws IOException
	 *             propagated from the index reader calls
	 * @throws SolrException
	 *             with BAD_REQUEST if a requested field is not defined in the
	 *             schema or a doc id parameter is not an integer
	 */
	@Override
	public void process(ResponseBuilder rb) throws IOException {
		SolrParams params = rb.req.getParams();
		if (!params.getBool(COMPONENT_NAME, false)) {
			return;
		}

		NamedList<Object> termVectors = new NamedList<Object>();
		rb.rsp.add(TERM_VECTORS, termVectors);

		IndexSchema schema = rb.req.getSchema();
		SchemaField keyField = schema.getUniqueKeyField();
		String uniqFieldName = null;
		if (keyField != null) {
			uniqFieldName = keyField.getName();
			termVectors.add("uniqueKeyFieldName", uniqFieldName);
		}

		// Request-wide options: used as-is when all fields are extracted, and
		// as the defaults for the per-field options otherwise.
		FieldOptions allFields = new FieldOptions();
		allFields.termFreq = params.getBool(TermVectorParams.TF, false);
		allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
		allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
		allFields.docFreq = params.getBool(TermVectorParams.DF, false);
		allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
		// short cut to all values.
		if (params.getBool(TermVectorParams.ALL, false)) {
			allFields.termFreq = true;
			allFields.positions = true;
			allFields.offsets = true;
			allFields.docFreq = true;
			allFields.tfIdf = true;
		}

		// Build up our per field mapping
		Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
		NamedList<List<String>> warnings = new NamedList<List<String>>();
		List<String> noTV = new ArrayList<String>();
		List<String> noPos = new ArrayList<String>();
		List<String> noOff = new ArrayList<String>();

		Set<String> fields = getFields(rb);
		if (null != fields) {
			// we have specific fields to retrieve, or no fields
			for (String field : fields) {

				// workaround for SOLR-3523
				if (null == field || "score".equals(field)) {
					continue;
				}

				// we don't want to issue warnings about the uniqueKey field
				// since it can cause lots of confusion in distributed requests
				// where the uniqueKey field is injected into the fl for merging
				final boolean fieldIsUniqueKey = field.equals(uniqFieldName);

				SchemaField sf = schema.getFieldOrNull(field);
				if (sf == null) {
					// field doesn't exist
					throw new SolrException(
							SolrException.ErrorCode.BAD_REQUEST,
							"undefined field: " + field);
				}

				if (!sf.storeTermVector()) {
					// field doesn't have term vectors
					if (!fieldIsUniqueKey) {
						noTV.add(field);
					}
					continue;
				}

				FieldOptions option = fieldOptions.get(field);
				if (option == null) {
					option = new FieldOptions();
					option.fieldName = field;
					fieldOptions.put(field, option);
				}
				// get the per field mappings
				option.termFreq = params.getFieldBool(field,
						TermVectorParams.TF, allFields.termFreq);
				option.docFreq = params.getFieldBool(field,
						TermVectorParams.DF, allFields.docFreq);
				option.tfIdf = params.getFieldBool(field,
						TermVectorParams.TF_IDF, allFields.tfIdf);
				// Validate that positions/offsets are even an option for
				// this field
				option.positions = params.getFieldBool(field,
						TermVectorParams.POSITIONS, allFields.positions);
				if (option.positions && !sf.storeTermPositions()
						&& !fieldIsUniqueKey) {
					noPos.add(field);
				}
				option.offsets = params.getFieldBool(field,
						TermVectorParams.OFFSETS, allFields.offsets);
				if (option.offsets && !sf.storeTermOffsets()
						&& !fieldIsUniqueKey) {
					noOff.add(field);
				}
			}
		} // else, deal with all fields

		// NOTE: currently all types of warnings are schema driven, and
		// guaranteed to be consistent across all shards - if additional types
		// of warnings are added that might be different between shards,
		// finishStage() needs to be changed to account for that.
		boolean hasWarnings = false;
		if (!noTV.isEmpty()) {
			warnings.add("noTermVectors", noTV);
			hasWarnings = true;
		}
		if (!noPos.isEmpty()) {
			warnings.add("noPositions", noPos);
			hasWarnings = true;
		}
		if (!noOff.isEmpty()) {
			warnings.add("noOffsets", noOff);
			hasWarnings = true;
		}
		if (hasWarnings) {
			termVectors.add("warnings", warnings);
		}

		// Explicit doc ids win over the documents of the current result list.
		DocListAndSet listAndSet = rb.getResults();
		List<Integer> docIds = getInts(params
				.getParams(TermVectorParams.DOC_IDS));
		Iterator<Integer> iter;
		if (docIds != null && !docIds.isEmpty()) {
			iter = docIds.iterator();
		} else {
			DocList list = listAndSet.docList;
			iter = list.iterator();
		}
		SolrIndexSearcher searcher = rb.req.getSearcher();

		IndexReader reader = searcher.getIndexReader();

		final String finalUniqFieldName = uniqFieldName;

		final List<String> uniqValues = new ArrayList<String>();

		// Visitor that only loads the uniqueKey field of a document and
		// collects its value(s) as strings.
		// TODO: is this required to be single-valued? if so, we should STOP
		// once we find it...
		final StoredFieldVisitor getUniqValue = new StoredFieldVisitor() {
			@Override
			public void stringField(FieldInfo fieldInfo, String value) {
				uniqValues.add(value);
			}

			@Override
			public void intField(FieldInfo fieldInfo, int value) {
				uniqValues.add(Integer.toString(value));
			}

			@Override
			public void longField(FieldInfo fieldInfo, long value) {
				uniqValues.add(Long.toString(value));
			}

			@Override
			public Status needsField(FieldInfo fieldInfo) {
				return (fieldInfo.name.equals(finalUniqFieldName)) ? Status.YES
						: Status.NO;
			}
		};

		// reused across fields and documents
		TermsEnum termsEnum = null;

		while (iter.hasNext()) {
			final int docId = iter.next();
			NamedList<Object> docNL = new NamedList<Object>();

			if (keyField != null) {
				reader.document(docId, getUniqValue);
				// NOTE: when the document has no stored uniqueKey value,
				// docNL is not attached to the response at all.
				if (!uniqValues.isEmpty()) {
					String uniqVal = uniqValues.get(0);
					uniqValues.clear();
					docNL.add("uniqueKey", uniqVal);
					termVectors.add(uniqVal, docNL);
				}
			} else {
				// support for schemas w/o a unique key,
				termVectors.add("doc-" + docId, docNL);
			}

			if (null != fields) {
				for (Map.Entry<String, FieldOptions> entry : fieldOptions
						.entrySet()) {
					final String field = entry.getKey();
					final Terms vector = reader.getTermVector(docId, field);
					if (vector != null) {
						// obtain the enum once and hand that same instance on
						termsEnum = vector.iterator(termsEnum);
						mapOneVector(docNL, entry.getValue(), reader, docId,
								termsEnum, field);
					}
				}
			} else {
				// extract all fields; a document without any stored term
				// vectors yields null here and simply contributes no fields
				final Fields vectors = reader.getTermVectors(docId);
				if (vectors != null) {
					for (String field : vectors) {
						Terms terms = vectors.terms(field);
						if (terms != null) {
							termsEnum = terms.iterator(termsEnum);
							mapOneVector(docNL, allFields, reader, docId,
									termsEnum, field);
						}
					}
				}
			}
		}
	}

	/**
	 * Writes the term vector of one field of one document into
	 * <code>docNL</code>.
	 * <p>
	 * A new list is added to <code>docNL</code> under the field name; it
	 * receives one entry per term returned by <code>termsEnum</code>.
	 * Depending on <code>fieldOptions</code>, each term entry may contain, in
	 * this order: "tf", "positions" (a list of "position" values), "offsets"
	 * (a list of "start"/"end" values), "df" and "tf-idf". Positions and
	 * offsets are only emitted when the enum provides a postings enum for the
	 * term; offsets are dropped for a term as soon as the first start offset
	 * read is -1.
	 * </p>
	 *
	 * @param docNL
	 *            per-document list that receives the field entry
	 * @param fieldOptions
	 *            flags selecting what is reported for each term
	 * @param reader
	 *            reader used to look up document frequencies
	 * @param docID
	 *            id of the document the vector belongs to (not used here)
	 * @param termsEnum
	 *            enum over the terms of the vector, positioned before the
	 *            first term
	 * @param field
	 *            name of the field the vector belongs to
	 * @throws IOException
	 *             propagated from the enum and reader calls
	 */
	private void mapOneVector(NamedList<Object> docNL,
			FieldOptions fieldOptions, IndexReader reader, int docID,
			TermsEnum termsEnum, String field) throws IOException {
		final NamedList<Object> perField = new NamedList<Object>();
		docNL.add(field, perField);

		DocsAndPositionsEnum postings = null;
		for (BytesRef termBytes = termsEnum.next(); termBytes != null; termBytes = termsEnum
				.next()) {
			final NamedList<Object> perTerm = new NamedList<Object>();
			perField.add(termBytes.utf8ToString(), perTerm);

			final int termFreq = (int) termsEnum.totalTermFreq();
			if (fieldOptions.termFreq) {
				perTerm.add("tf", termFreq);
			}

			// positions/offsets can only be honoured when postings exist
			postings = termsEnum.docsAndPositions(null, postings);
			final boolean havePostings = postings != null;
			if (havePostings) {
				postings.nextDoc();
			}
			final boolean wantPositions = havePostings
					&& fieldOptions.positions;
			boolean wantOffsets = havePostings && fieldOptions.offsets;

			if (wantPositions || wantOffsets) {
				NamedList<Integer> positionList = null;
				NamedList<Number> offsetList = null;

				for (int occurrence = 0; occurrence < termFreq; occurrence++) {
					final int position = postings.nextPosition();
					if (wantPositions && position >= 0) {
						if (positionList == null) {
							positionList = new NamedList<Integer>();
							perTerm.add("positions", positionList);
						}
						positionList.add("position", position);
					}

					// decide once per term whether offsets are available
					if (wantOffsets && offsetList == null) {
						if (postings.startOffset() == -1) {
							wantOffsets = false;
						} else {
							offsetList = new NamedList<Number>();
							perTerm.add("offsets", offsetList);
						}
					}

					if (offsetList != null) {
						offsetList.add("start", postings.startOffset());
						offsetList.add("end", postings.endOffset());
					}
				}
			}

			final boolean needDocFreq = fieldOptions.docFreq
					|| fieldOptions.tfIdf;
			final int docFreq = needDocFreq ? reader.docFreq(new Term(field,
					termBytes)) : 0;

			if (fieldOptions.docFreq) {
				perTerm.add("df", docFreq);
			}

			// TODO: this is not TF/IDF by anyone's definition!
			if (fieldOptions.tfIdf) {
				final double ratio = ((double) termFreq) / docFreq;
				perTerm.add("tf-idf", ratio);
			}
		}
	}

	/**
	 * Parses an array of strings into a list of integers.
	 *
	 * @param vals
	 *            the raw parameter values, may be <code>null</code> or empty
	 * @return the parsed values in input order, or <code>null</code> when
	 *         <code>vals</code> is <code>null</code> or empty
	 * @throws SolrException
	 *             with BAD_REQUEST if any value is not a valid integer; the
	 *             original {@link NumberFormatException} is kept as the cause
	 */
	private List<Integer> getInts(String[] vals) {
		if (vals == null || vals.length == 0) {
			return null;
		}
		List<Integer> result = new ArrayList<Integer>(vals.length);
		for (String val : vals) {
			try {
				// valueOf instead of the deprecated boxing constructor; it
				// rejects the same inputs with NumberFormatException
				result.add(Integer.valueOf(val));
			} catch (NumberFormatException e) {
				throw new SolrException(
						SolrException.ErrorCode.BAD_REQUEST,
						e.getMessage(), e);
			}
		}
		return result;
	}

	/**
	 * Intentionally empty: this component does no work in the prepare phase.
	 *
	 * @param rb
	 *            the response builder for the current request (unused)
	 */
	@Override
	public void prepare(ResponseBuilder rb) throws IOException {

	}

	/**
	 * Merges the {@link #TERM_VECTORS} sections returned by the shards into
	 * one section of the overall response. Only acts in
	 * {@link ResponseBuilder#STAGE_GET_FIELDS}.
	 * <p>
	 * Only finished shard requests that had the GET_FIELDS purpose and carried
	 * a true {@link #COMPONENT_NAME} parameter are considered. Entries whose
	 * key is one of the ids in <code>rb.resultIds</code> are per-document
	 * entries and are placed at that document's position in the merged
	 * result; all other entries are treated as metadata and are taken from
	 * the first shard that supplies them, in the order encountered. Shard
	 * responses that carry no response object or no term vector section are
	 * skipped.
	 * </p>
	 *
	 * @param rb
	 *            the response builder of the distributed request
	 */
	@Override
	public void finishStage(ResponseBuilder rb) {
		if (rb.stage != ResponseBuilder.STAGE_GET_FIELDS) {
			return;
		}

		NamedList<Object> termVectors = new NamedList<Object>();
		// generic array creation is not possible, hence the raw element type
		@SuppressWarnings("unchecked")
		Map.Entry<String, Object>[] arr = new NamedList.NamedListEntry[rb.resultIds
				.size()];

		for (ShardRequest sreq : rb.finished) {
			if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) == 0
					|| !sreq.params.getBool(COMPONENT_NAME, false)) {
				continue;
			}
			for (ShardResponse srsp : sreq.responses) {
				if (srsp.getSolrResponse() == null) {
					// nothing came back from this shard
					continue;
				}
				@SuppressWarnings("unchecked")
				NamedList<Object> nl = (NamedList<Object>) srsp
						.getSolrResponse().getResponse().get(TERM_VECTORS);
				if (nl == null) {
					// this shard returned no term vector section
					continue;
				}
				for (int i = 0; i < nl.size(); i++) {
					String key = nl.getName(i);
					ShardDoc sdoc = rb.resultIds.get(key);
					if (null == sdoc) {
						// metadata, only need from one node, leave in order
						if (termVectors.indexOf(key, 0) < 0) {
							termVectors.add(key, nl.getVal(i));
						}
					} else {
						int idx = sdoc.positionInResponse;
						arr[idx] = new NamedList.NamedListEntry<Object>(
								key, nl.getVal(i));
					}
				}
			}
		}
		// remove nulls in case not all docs were able to be retrieved
		termVectors.addAll(SolrPluginUtils
				.removeNulls(new NamedList<Object>(arr)));
		rb.rsp.add(TERM_VECTORS, termVectors);
	}

	// ////////////////////// NamedListInitializedPlugin methods
	// //////////////////////

	/**
	 * Passes the arguments to the superclass and keeps a reference to them in
	 * {@link #initParams}.
	 *
	 * @param args
	 *            the initialization arguments for this component
	 */
	@Override
	public void init(NamedList args) {
		super.init(args);
		this.initParams = args;
	}

	/**
	 * {@link SolrCoreAware} callback; intentionally empty because this
	 * component does nothing with the core.
	 *
	 * @param core
	 *            the core this component belongs to (unused)
	 */
	@Override
	public void inform(SolrCore core) {

	}

	/**
	 * Returns the hard-coded source-control URL keyword string for this file.
	 *
	 * @return the <code>$URL: ... $</code> string identifying this source file
	 */
	@Override
	public String getSource() {
		return "$URL: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_4_0/solr/core/src/java/org/apache/solr/handler/component/TermVectorComponent.java $";
	}

	/**
	 * Returns a short, fixed, human-readable description of this component.
	 *
	 * @return the description text
	 */
	@Override
	public String getDescription() {
		return "A Component for working with Term Vectors";
	}
}

/**
 * Mutable holder for the term vector details requested for a field. All
 * flags default to <code>false</code> and the field name to
 * <code>null</code>.
 */
class FieldOptions {
	/** Name of the field these options apply to. */
	String fieldName;

	/** Whether the term frequency ("tf") is reported. */
	boolean termFreq;

	/** Whether term positions are reported. */
	boolean positions;

	/** Whether term start/end offsets are reported. */
	boolean offsets;

	/** Whether the document frequency ("df") is reported. */
	boolean docFreq;

	/** Whether the "tf-idf" value is reported. */
	boolean tfIdf;
}
